As a former technical recruiter, I was interested in taking a look at the job market for data professionals amidst the pandemic. Here we'll be looking at Data Analyst positions specifically.
You can use this analysis to get an idea about what companies and industries in general have been actively hiring even during the pandemic and where you should focus your job search depending on various factors such as location, company size, etc.
This analysis can also be useful to recruiters, especially staffing agencies, when it comes to finding out information about their competitors and also scoping out new clients.
All of my data comes from Kaggle and you can find the dataset here: https://www.kaggle.com/andrewmvd/data-analyst-jobs
# Notebook display helpers (used for rendering rich output inline).
from IPython.display import Image
from IPython.core.display import HTML
# Core data stack.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#using plotly offline
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()
import plotly.graph_objs as go
import plotly.express as px
from IPython.display import display
# Silence all warnings for cleaner notebook output (hides pandas/plotly
# deprecation notices too — remove when debugging).
import warnings
warnings.filterwarnings('ignore')
# Load the Kaggle "Data Analyst Jobs" scrape (Glassdoor postings).
df = pd.read_csv('DataAnalyst.csv')
df.head()
#dropping columns I won't be using
df = df.drop(['Unnamed: 0', 'Founded', 'Competitors', 'Easy Apply'], axis=1)
#taking away the rating from the end of each company name
# NOTE(review): [:-4] assumes every company cell ends with a newline plus a
# 3-character rating (e.g. "Acme\n3.8") — companies without a rating would
# lose real characters; TODO confirm against the raw dataset.
df['Company Name'] = df['Company Name'].map(lambda x: str(x)[:-4])
#making location names just the State Abbreviation rather than city, state
# NOTE(review): [-2:] assumes every location is formatted "City, ST".
df['Location'] = df['Location'].map(lambda x: str(x)[-2:])
#making Salary Estimates just the range and taking away any extra info
# NOTE(review): [:-16] assumes a fixed-width trailing suffix such as
# "(Glassdoor est.)"; entries with a different suffix (e.g. employer
# estimates) would be truncated incorrectly — TODO confirm.
df['Salary Estimate'] = df['Salary Estimate'].map(lambda x: str(x)[:-16])
#this dataset sets nan values as -1 instead of nan so I am switching this to nan values and then dropping them
df=df.replace(-1,np.nan)
df=df.replace(-1.0,np.nan)
df=df.replace('-1',np.nan)
#dropping rows containing any values that are NA
df = df.dropna()
#final dataframe
df.head()
#total number of rows
df['Job Title'].count()
from wordcloud import WordCloud
def wordcloud(column):
    """Render a word cloud built from every value in the given column of the
    module-level ``df`` DataFrame."""
    all_text = ' '.join(df[column].tolist())
    cloud = WordCloud(max_font_size=50, max_words=75,
                      background_color="white").generate(all_text)
    plt.figure(figsize=(10, 12))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(str(column) + ' Word Cloud \n')
    plt.axis("off")
    plt.show()
# Word clouds for the two free-text columns of interest.
wordcloud('Job Title')
wordcloud('Job Description')
These would be some good keywords to use when searching for data analyst positions across job boards, and they might also be worth adding to your resume if you're looking for a data analyst position.
# Openings per job title. rename_axis(...).reset_index(name=...) produces the
# same ['Job Title', 'Total Openings'] columns on every pandas version; the
# old rename({'index': ...}) idiom breaks on pandas >= 2.0, where
# value_counts().reset_index() no longer emits an 'index' column.
title_count = (df['Job Title'].value_counts()
               .rename_axis('Job Title')
               .reset_index(name='Total Openings'))
title_count_head = title_count.head(10)
fig = px.bar(title_count_head, x='Job Title', y='Total Openings', color='Job Title', text='Total Openings',
             color_discrete_sequence=px.colors.qualitative.Safe, width=900, height=600,
             title='Job Titles With the Most Openings')
fig.update_traces(textposition='outside')
fig.show()
It looks like the most common Job Titles for Data Analyst positions are 'Data Analyst', 'Senior Data Analyst', and 'Junior Data Analyst' - so these are pretty general job titles rather than the more specific or fancier ones.
#creating dataframe for total positions by industry
# rename_axis(...).reset_index(name=...) keeps the column names stable across
# pandas versions (value_counts().reset_index() changed output in pandas 2.0).
industry_count = (df['Industry'].value_counts()
                  .rename_axis('Industry')
                  .reset_index(name='Total'))
industry_count.head()
#creating dataframe for total positions by sector, same construction
sector_count = (df['Sector'].value_counts()
                .rename_axis('Sector')
                .reset_index(name='Total'))
sector_count.head()
# Pie charts recur throughout the analysis, so the plotting logic lives in
# one reusable function.
def pie_chart(df, x, y):
    """Show a pie chart of ``df`` with slice sizes from column ``x`` and
    slice labels/colors from column ``y``."""
    chart = px.pie(df, values=x, names=y,
                   title='Data Analyst Positions by ' + str(y),
                   labels=y, height=600, color=y,
                   color_discrete_sequence=px.colors.qualitative.Safe)
    chart.update_traces(textposition='inside', textinfo='percent+label')
    chart.show()

pie_chart(industry_count, 'Total', 'Industry')
pie_chart(sector_count, 'Total', 'Sector')
# Openings per company-size bucket. rename_axis(...).reset_index(name=...)
# keeps the column names stable across pandas versions (the old
# rename({'index': ...}) idiom breaks on pandas >= 2.0).
size_count = (df['Size'].value_counts()
              .rename_axis('Size')
              .reset_index(name='Number of Companies'))
fig = px.bar(size_count, x='Size', y='Number of Companies', color='Size', text='Number of Companies',
             color_discrete_sequence=px.colors.qualitative.Safe,
             width=900, height=600, title='Openings by Company Size')
fig.update_traces(textposition='outside')
fig.show()
It's interesting to see that smaller companies (51-200 employees) are actually hiring the most Data Analysts, with larger companies (10,000+ employees) coming in second.
#creating dataframe for total number of positions by location (state)
# rename_axis(...).reset_index(name=...) keeps the column names stable across
# pandas versions (value_counts().reset_index() changed output in pandas 2.0).
location_count = (df['Location'].value_counts()
                  .rename_axis('Location')
                  .reset_index(name='Total'))
fig = px.choropleth(location_count, locations='Location', locationmode="USA-states", scope='usa',
                    color='Total', title="Data Analyst Positions by Location", width=900,
                    height=600, color_continuous_scale=px.colors.sequential.deep)
fig.show()
As you can clearly see on the map - California, Texas, and New York seem to be the hottest locations for Data Analyst openings.
#companies with the most data analyst openings
# rename_axis(...).reset_index(name=...) keeps the column names stable across
# pandas versions (the old rename({'index': ...}) idiom breaks on pandas >= 2.0).
company_count = (df['Company Name'].value_counts()
                 .rename_axis('Company')
                 .reset_index(name='Total Job Openings'))
top_10 = company_count.head(10)
#companies with the most data analyst openings not including companies who categorize themselves as staffing
#agencies or IT Services organizations
# A single ~isin filter replaces the two chained .loc[... != ...] passes;
# both keep exactly the rows whose Industry is neither excluded value.
staffing = df.loc[~df['Industry'].isin(['Staffing & Outsourcing', 'IT Services'])]
minus_staffing = (staffing['Company Name'].value_counts()
                  .rename_axis('Company')
                  .reset_index(name='Total Job Openings'))
minus_staffing_top = minus_staffing.head(10)
from plotly.subplots import make_subplots

# One qualitative 10-color palette shared by both subplots (was duplicated
# inline for each trace).
BAR_COLORS = ['rgb(136, 204, 238)', 'rgb(204, 102, 119)', 'rgb(221, 204, 119)',
              'rgb(17, 119, 51)', 'rgb(51, 34, 136)', 'rgb(170, 68, 153)',
              'rgb(68, 170, 153)', 'rgb(153, 153, 51)', 'rgb(136, 34, 85)',
              'rgb(102, 17, 0)']

fig = make_subplots(subplot_titles=("Including Staffing and IT Services Agencies \n",
                                    "Not Including Staffing and IT Services Agencies \n"),
                    rows=2, cols=1, vertical_spacing=0.30)
fig.add_trace(go.Bar(x=top_10['Company'], y=top_10['Total Job Openings'],
                     marker=dict(color=BAR_COLORS)), row=1, col=1)
fig.add_trace(go.Bar(x=minus_staffing_top['Company'], y=minus_staffing_top['Total Job Openings'],
                     marker=dict(color=BAR_COLORS)), row=2, col=1)
# Both traces plot head(10), so the title says "Top 10" (was mislabeled "Top 15").
fig.update_layout(showlegend=False, height=900, width=800,
                  title_text='Top 10 Companies with the Most Openings')
fig.show()
Some people prefer not to work with Staffing/Third-Party agencies, so I wanted to show the companies with the most openings with and without companies that categorize themselves as a Staffing/IT Services organization.
Note - some of the companies for the top 10 not including staffing agencies might still include a staffing agency or two simply because these companies don't actually categorize themselves as one.
# Openings per salary range (value_counts returns most-common-first).
# rename_axis(...).reset_index(name=...) keeps the column names stable across
# pandas versions (the old rename({'index': ...}) idiom breaks on pandas >= 2.0).
salary_count = (df['Salary Estimate'].value_counts()
                .rename_axis('Salary')
                .reset_index(name='Total Job Openings'))
# Bug fix: the original sort_values() call discarded its result (no inplace,
# no assignment), so it was a no-op. Keep the 10 most common ranges, then
# order them ascending so the largest bar renders at the top of the
# horizontal bar chart.
salary_head = salary_count.head(10).sort_values(by='Total Job Openings', ascending=True)
fig = px.bar(salary_head, x='Total Job Openings', y='Salary', orientation='h', color='Salary', height=500,
             text='Total Job Openings', width=800, color_discrete_sequence=px.colors.qualitative.Safe)
fig.update_layout(title_text='Total Openings by Salary', showlegend=True)
fig.update_traces(textposition='outside')
fig.show()
#playing around with KNN for the first time
from sklearn import preprocessing
# Encode each Sector string as an integer so it can be used as a numeric feature.
le = preprocessing.LabelEncoder()
df_sector = pd.DataFrame(df['Sector'], columns=['Sector'])
df_sector['Sector_Encoded'] = le.fit_transform(df_sector['Sector'])
# Lookup table mapping each Sector name to its encoded integer (displayed below
# so we can pick codes by hand for predictions).
df_labels = df_sector.drop_duplicates(subset=['Sector'])
df_labels
# Feature rows: (encoded sector, company rating) pairs.
features=list(zip(df_sector['Sector_Encoded'], df['Rating']))
from sklearn.neighbors import KNeighborsClassifier
#setting KNN=the sqrt of 1560 and setting the training sets
# NOTE(review): the model is fit on the full dataset with no train/test split,
# so this is exploratory only — predictions below are nearest-neighbor lookups
# over all rows, not validated classification.
model = KNeighborsClassifier(n_neighbors=39)
model.fit(features, df['Company Name'])
Let's say you want a job in the Education sector at a company with a 4.0 rating. What company has openings that you should take a look at?
#if you look at df_labels you can see Education is encoded as 7 so we would use that number and 4.0 for rating
# Predict the company whose 39 nearest (sector, rating) neighbors vote for it.
trial = model.predict([[7, 4.0]])
print(trial)
Now let's say someone else is looking for a Non-Profit position at a top rated company. What company has openings that they should take a look at?
# Same lookup for the Non-Profit sector (encoded as 5 per df_labels) at a
# top 5.0 rating.
trial2 = model.predict([[5, 5.0]])
print(trial2)